0. Load libraries, scripts and data

### Libraries
library(magrittr)
library(ArchR)
library(ggpubr)
library(purrr)
library(Seurat)
library(parallel)
library(scDblFinder)
library(qs)
source("plotCellQuality.R")
source("plotUmapQC.R")
source("plotUmapClusters.R")
source("plotUmaps.R")
### Set ArchR parameters.
addArchRThreads(threads = 8) # number of parallel threads
addArchRGenome("hg38") # genome and gene annotation ("hg19", "hg38", "mm9", "mm10")
### Set HDF5 environment variables to prevent HDF5 error later
Sys.setenv(HDF5_USE_FILE_LOCKING=FALSE)
Sys.setenv(RHDF5_USE_FILE_LOCKING=FALSE)
### Samples
samples <- c("PBMC_scATAC", "PBMC_scTurboATAC")

### Sample palette
sample_palette <- c("grey", "blue")
names(sample_palette) <- samples
### Data
inputFiles <- c("PBMC_10xTn5_CellRangerATAC_fragments.tsv.gz",
                "PBMC_Tn5hc_CellRangerATAC_fragments.tsv.gz")
names(inputFiles) <- samples

1. Create Arrow Files

overview
ArrowFiles <- createArrowFiles(
  inputFiles = inputFiles,
  sampleNames = names(inputFiles),
  minTSS = 0, #Dont set this too high because you can always increase later
  minFrags = 100,
  maxFrags = 5000000,
  addTileMat = TRUE,
  addGeneScoreMat = FALSE,
  verbose = FALSE, 
  subThreading = FALSE 
)
names(ArrowFiles) <- gsub("\\.arrow", "", ArrowFiles)

2. Create and preprocess ArchRProject

overview
proj_list <- list()

for (sample in samples){
    proj_list[[sample]] <- ArchRProject(
      ArrowFiles = ArrowFiles[[sample]], 
      outputDirectory = paste0("ArchRProj_", sample),
      copyArrows = TRUE #This is recommened so that if you modify the Arrow files you have an original copy for later usage.
    )
}
proj <- ArchRProject(
      ArrowFiles = ArrowFiles, 
      outputDirectory = "ArchRProj_all",
      copyArrows = TRUE #This is recommened so that if you modify the Arrow files you have an original copy for later usage.
    )

a) Quality Control

### Fragment Size distributions
fragsize_samples <- plotFragmentSizes(ArchRProj = proj, pal = sample_palette) + theme(plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=0), legend.title = element_text(size=0), legend.text = element_text(size=16)) +
ggtitle("Fragment size distribution")

### TSS enrichment profiles
tssprofiles_samples <- plotTSSEnrichment(ArchRProj = proj, pal = sample_palette) + theme(plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=0), legend.title = element_text(size=0), legend.text = element_text(size=16)) +
ggtitle("TSS enrichment profile")
options(repr.plot.width=32, repr.plot.height=10)
ggAlignPlots(fragsize_samples, tssprofiles_samples, type = "h")

b) Sample Statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=20, repr.plot.height=12)
ggarrange(log10frags_samples, tss_samples, ncol = 2)

c) Cell Statistics

minTSS <- c(10, 8)
names(minTSS) <- samples
minFrag <- c(4.1, 4.4)
names(minFrag) <- samples
p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, set_xlim = c(2, 6), set_ylim = c(0, 45), 
                                                           cutoff_frags = minFrag[sample], cutoff_tss = minTSS[sample])})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=13)
ggarrange(plotlist = p_list, ncol = 2, nrow = 1)

3. Filtering I

overview

a) Filtering by minimal number of fragments

Filter cells by minFrag cutoff for a normal distribution of unique fragments per cell.

  • PBMC_scATAC: 10^4.1
  • PBMC_scTurboATAC: 10^4.4
minFrag <- c(4.1, 4.4)
names(minFrag) <- samples
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "log10(nFrags)")[,1] >= minFrag[x])], ]})
names(proj_list) <- samples

b) Filtering by minimal TSS enrichment score

Filter cells by minTSS cutoff for a normal distribution of TSS enrichment scores per cell.

  • PBMC_scATAC: 10
  • PBMC_scTurboATAC: 8
minTSS <- c(10, 8)
names(minTSS) <- samples
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "TSSEnrichment")[,1] > minTSS[x])], ]})
names(proj_list) <- samples
for (sample in samples){
    print(sample)
    nFiltered <- getCellColData(proj_list[[sample]]) %>% nrow(.)
    print(paste0("Filtered cells: ", nFiltered))
}
[1] "PBMC_scATAC"
[1] "Filtered cells: 7658"
[1] "PBMC_scTurboATAC"
[1] "Filtered cells: 8309"

c) Sample Statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=32, repr.plot.height=20)
ggarrange(log10frags_samples, tss_samples, ncol = 2)

d) Cell Statistics

options(repr.plot.width=30, repr.plot.height=13)
p <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, cutoff_frags = minFrag[[sample]], cutoff_tss = minTSS[[sample]])}) %>% ggarrange(plotlist = ., ncol = 2, nrow = 1)
p

e) Separate single-cell embeddings

proj_list <- lapply(proj_list, function(x){addIterativeLSI(
    ArchRProj = x,
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = 1:30,
    force = TRUE
)})
IterativeLSI_list <- lapply(proj_list, function(x){getReducedDims(x, "IterativeLSI", returnMatrix = FALSE)})
options(repr.plot.width=21, repr.plot.height=5)
par(mfrow=c(1, 3))
for (x in samples){
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', main=x, ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1)
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1, ylim=c(0,1500))
    plot(IterativeLSI_list[[x]]$corToDepth$none, bty='l', xlab='LSI components', ylab='Correlation to sequencing depth', pch=16, cex.lab=1.2, cex.axis=1.1)
}
LSIcomponents <- list(2:22, 2:21)
names(LSIcomponents) <- samples
proj_list <- lapply(names(proj_list), function(x){addIterativeLSI(
    ArchRProj = proj_list[[x]],
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = LSIcomponents[[x]],
    force = TRUE
)})

names(proj_list) <- samples
proj_list <- lapply(proj_list, function(x){addUMAP(
    ArchRProj = x, 
    reducedDims = "IterativeLSI", 
    name = "UMAP", 
    nNeighbors = 30, 
    minDist = 0.5, 
    metric = "cosine",
    force = TRUE
)})
umap_list <- lapply(names(proj_list), function(x){plotUmapQC(proj_list[[x]], "UMAP", "Sample", x, doubletScore=FALSE, coloringPalette=sample_palette)})
names(umap_list) <- names(proj_list)
options(repr.plot.width=25, repr.plot.height=40)
lapply(umap_list, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=2)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=2)

4. Doublet detection

overview

a) Amulet

amulet_res <- lapply(samples, function(sample){res <- amulet(inputFiles[sample],
                                                             barcodes = rownames(proj_list[[sample]]@cellColData) %>% gsub(paste0(sample, "#"), "", .),
                                                             regionsToExclude = GRanges(c("M","chrM","MT","X","Y","chrX","chrY"), IRanges(1L, width=10^8)), # excluding repeats, as well as sex and mitochondrial chromosomes
                                                             uniqueFrags = TRUE, # only use unique fragments
                                                             maxFragSize = 1000L, # maximum fragment size to consider
                                                             removeHighOverlapSites = TRUE); # remove sites that have more than two reads in unexpectedly many cells
                                               rownames(res) <- paste0(sample, "#", rownames(res));
                                               colnames(res) <- paste0("Amulet_", colnames(res));
                                               qsave(res, paste0("Amulet_DoubletDetection_", sample, ".qs"));
                                               return(res)})
for (sample in samples){
    proj_list[[sample]]@cellColData <- cbind(proj_list[[sample]]@cellColData, amulet_res[[sample]][match(rownames(proj_list[[sample]]@cellColData), rownames(amulet_res[[sample]])),])
    
    proj_list[[sample]]@cellColData$Amulet_doublet <- proj_list[[sample]]@cellColData$Amulet_q.value <= 1e-5
}

b) Correlation with sequencing depth

plot_list <- list()

for (sample in samples){
    plot_list[["1"]][[sample]] <- ggplot(as.data.frame(proj_list[[sample]]@cellColData), aes(x = log10(Amulet_nFrags), y = log10(Amulet_q.value))) + ggpointdensity::geom_pointdensity() +
                                    ggtitle(sample) + 
                                    ggpubr::stat_cor(p.accuracy = 0.001, r.accuracy = 0.01, method = "spearman", size = 5) +
                                    theme_minimal() + theme(text = element_text(size = 15))
    plot_list[["2"]][[sample]] <- ggplot(as.data.frame(proj_list[[sample]]@cellColData), aes(x = log10(Amulet_nFrags), y = log10(Amulet_p.value))) + ggpointdensity::geom_pointdensity() +
                                    ggtitle(sample) + 
                                    ggpubr::stat_cor(p.accuracy = 0.001, r.accuracy = 0.01, method = "spearman", size = 5) +
                                    theme_minimal() + theme(text = element_text(size = 15))
}
options(repr.plot.width=25, repr.plot.height=5)
ggarrange(plotlist = plot_list[["1"]], ncol =4)
ggarrange(plotlist = plot_list[["2"]], ncol =4)

c) Inspect doublets

umap_list <- list()

for (sample in samples){
    umap_list[[sample]] <- plotEmbedding(proj_list[[sample]], embedding = "UMAP", 
                                         colorBy = "cellColData", name = "Amulet_doublet",
                                         pal = c("FALSE" = "grey", "TRUE" = "darkred"), size = 0.5)
}
options(repr.plot.width=25, repr.plot.height=10)
ggarrange(plotlist = umap_list, ncol=2, nrow=1)
for (sample in samples){
    print(sample)
    print(table(proj_list[[sample]]@cellColData$Amulet_doublet))
    print(round(sum(proj_list[[sample]]@cellColData$Amulet_doublet) / nrow(proj_list[[sample]]@cellColData), 2))
}
[1] "PBMC_scATAC"

FALSE  TRUE 
 6671   987 
[1] 0.13
[1] "PBMC_scTurboATAC"

FALSE  TRUE 
 7763   546 
[1] 0.07

5. Filtering II

overview

a) Filtering by doublets

proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[!proj_list[[x]]$Amulet_doublet]]})
names(proj_list) <- samples

b) Sample Statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Sample, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=28), legend.text = element_text(size=28)) +
                        scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=20, repr.plot.height=20)
ggarrange(log10frags_samples, tss_samples, ncol = 2)

c) Cell Statistics

options(repr.plot.width=15, repr.plot.height=10)
p <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, cutoff_frags = minFrag[[sample]], cutoff_tss = minTSS[sample])}) %>% ggarrange(plotlist = ., ncol = 2, nrow = 1)
p

d) Separate single-cell embeddings

proj_list <- lapply(proj_list, function(x){addIterativeLSI(
    ArchRProj = x,
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = 1:30,
    force = TRUE
)})
IterativeLSI_list <- lapply(proj_list, function(x){getReducedDims(x, "IterativeLSI", returnMatrix = FALSE)})
options(repr.plot.width=21, repr.plot.height=5)
par(mfrow=c(1, 3))
for (x in samples){
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', main=x, ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1)
    plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1, ylim=c(0,1500))
    plot(IterativeLSI_list[[x]]$corToDepth$none, bty='l', xlab='LSI components', ylab='Correlation to sequencing depth', pch=16, cex.lab=1.2, cex.axis=1.1)
}
LSIcomponents <- list(2:21, 2:21)
names(LSIcomponents) <- samples
proj_list <- lapply(names(proj_list), function(x){addIterativeLSI(
    ArchRProj = proj_list[[x]],
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = LSIcomponents[[x]],
    force = TRUE
)})

names(proj_list) <- samples
proj_list <- lapply(proj_list, function(x){addUMAP(
    ArchRProj = x, 
    reducedDims = "IterativeLSI", 
    name = "UMAP", 
    nNeighbors = 30, 
    minDist = 0.5, 
    metric = "cosine",
    force = TRUE
)})
umap_list <- lapply(names(proj_list), function(x){plotUmapQC(proj_list[[x]], "UMAP", "Sample", x, doubletScore=FALSE, coloringPalette=sample_palette)})
names(umap_list) <- names(proj_list)
options(repr.plot.width=25, repr.plot.height=40)
lapply(umap_list, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=2)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=2)

6. Clustering of single cells

overview

a) Evaluate different clustering resolutions

umap_clusters_filtered <- lapply(proj_list, function(x){plotUmapClusters(x, reducedDims="IterativeLSI", 
                                           embedding="UMAP", resolutions = c(0.1, 0.2, 0.3))})
options(repr.plot.width=28, repr.plot.height=20)
lapply(umap_clusters_filtered, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=1)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=2)

b) Cluster cells

proj_list <- lapply(names(proj_list), function(x){addClusters(proj_list[[x]], 
                                                              reducedDims = "IterativeLSI", dimsToUse = LSIcomponents[[x]], 
                                                              method = "Seurat", name = "Clusters", resolution = 0.2)})
names(proj_list) <- samples
lapply(proj_list, function(x){table(x$Clusters)})
$PBMC_scATAC

  C1  C10   C2   C3   C4   C5   C6   C7   C8   C9 
 476  338 1416  869  352  673  931  179   81 1356 

$PBMC_scTurboATAC

  C1  C10  C11  C12  C13  C14   C2   C3   C4   C5   C6   C7   C8   C9 
 181  954  414  416   39  527   69 1530  354  279   61 1612  435  892 

c) Single-cell embeddings

umap_list <- lapply(proj_list, function(x){plotUmaps(x, embeddings = c("UMAP"), coloringLayer = "Clusters")[[1]]})
names(umap_list) <- samples
options(repr.plot.width=20, repr.plot.height=10)
ggarrange(plotlist = umap_list, ncol=2, nrow=1)

d) Sample statistics

log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = x$Clusters, log10nFrags = log10(x$nFrags), Sample = x$Sample)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + facet_wrap(~Sample) + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12), strip.text = element_text(size = 12)) +
                        scale_fill_manual(values = paletteDiscrete(paste0("C", 1:15))) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = x$Clusters, tss = x$TSSEnrichment, Sample = x$Sample)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) + 
                        theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14), axis.title = element_text(size=12), axis.text = element_text(size=12), axis.text.x=element_text(angle=90, vjust=1, hjust=1), 
                                                legend.title = element_text(size=12), legend.text = element_text(size=12), strip.text = element_text(size = 12)) +
                        scale_fill_manual(values = paletteDiscrete(paste0("C", 1:15))) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score") + facet_wrap(~Sample)
options(repr.plot.width=12, repr.plot.height=10)
ggarrange(log10frags_samples, tss_samples, nrow = 2)

7. Save ArchRProject

### Save ArchRProject
for (sample in samples){
    saveArchRProject(ArchRProj = proj_list[[sample]], outputDirectory = paste0("ArchRProj_", sample), load = FALSE)
}
lapply(names(proj_list), function(x){all(rownames(proj_list[[x]]@cellColData) == rownames(proj_list[[x]]@embeddings$UMAP$df)) %>% print(.);
                                     write.csv(cbind(proj_list[[x]]@embeddings$UMAP$df, proj_list[[x]]@cellColData), paste0("RawData_", x, ".csv"))})

#


Isabelle Seufert
Division of Chromatin Networks, DKFZ
12.09.2023